import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# Load the cleaned listings table. keep_default_na=False stops pandas from
# converting the literal string 'null' (and other default NA markers) to NaN,
# so those values stay as ordinary strings.
csv_path = './cleaned_df_with_manufact.csv'
dataset = pd.read_csv(csv_path, keep_default_na=False)

# Notebook-style inspection: describe() and head() only display something in
# an interactive session; info() prints its summary directly.
dataset.describe()
dataset.head(3)
dataset.info()
# --- Price distribution: histogram with 100 bins ---
fig, ax = plt.subplots(figsize=(8,6))
ax.hist(dataset['price'], bins=100)
ax.set_title('Distribution of Price')
ax.set_xlabel('Price')
ax.set_ylabel('Count')
fig.show()

# --- Year distribution: one bin per distinct year value ---
year_column = dataset['year']
year_mean = year_column.mean()
year_median = year_column.median()
distinct_years = len(year_column.value_counts())

fig, ax = plt.subplots(figsize=(8,6))
ax.hist(year_column, bins=distinct_years, align='right')
ax.set_title('Distribution of Vehicle Year')
ax.set_xlabel('Year')
ax.set_ylabel('Count')
# Hand-placed arrow near x=2009; the coordinates are hard-coded, presumably
# tuned to this particular dataset's histogram -- verify if the data changes.
ax.arrow(2009.2,5900,0,-2500,head_width=0.5,head_length=150,color='gray')
# Vertical reference lines for the mean (black) and median (red) year.
ax.axvline(x=year_mean,color='black',label='mean')
ax.axvline(x=year_median,color='r',label='median')
# Text labels for the two annotated years (positions are hard-coded as well).
ax.text(2017,8300,'2017',horizontalalignment='center')
ax.text(2009,6000,'2009',horizontalalignment='center')
ax.legend()
fig.show()
# --- Top 10 manufacturers by number of listings ---
# The bar lengths must come from the manufacturer counts; previously the
# manufacturer labels were paired with the *model* counts, so every bar in
# this chart showed the wrong value.
top_makes = dataset['manufacturer'].value_counts().iloc[:10]
fig, ax = plt.subplots(figsize=(8,6))
ax.barh(top_makes.index, top_makes.values)
ax.set_title('Top 10 Popular Make')
ax.set_xlabel('Count')
ax.invert_yaxis()  # value_counts() sorts descending; put the largest on top
fig.show()

# --- Top 10 models by number of listings ---
top_models = dataset['model'].value_counts().iloc[:10]
fig, ax = plt.subplots(figsize=(8,6))
ax.barh(top_models.index, top_models.values)
ax.set_title('Top 10 Popular Model')
ax.set_xlabel('Count')
ax.invert_yaxis()
fig.show()

# --- Listings per reported vehicle condition ---
condition_counts = dataset['condition'].value_counts()
fig, ax = plt.subplots(figsize=(8,6))
ax.barh(condition_counts.index, condition_counts.values)
ax.set_title('Vehicle Condition')
ax.set_xlabel('Count')
ax.invert_yaxis()
fig.show()
# Clearly, a large number (25416) of vehicle conditions are not reported in this dataset.
# --- Listings per engine cylinder category (horizontal bars) ---
cylinder_counts = dataset['cylinders'].value_counts()
fig, ax = plt.subplots(figsize=(8,6))
ax.barh(cylinder_counts.index, cylinder_counts.values)
ax.set_title('Engine Cylinder Count')
ax.set_xlabel('Count')
ax.invert_yaxis()  # most frequent category at the top
fig.show()

# --- Listings per fuel type (vertical bars) ---
fuel_counts = dataset['fuel'].value_counts()
fig, ax = plt.subplots(figsize=(8,6))
ax.bar(fuel_counts.index, fuel_counts.values)
ax.set_title('Fuel Type')
ax.set_ylabel('Count')
fig.show()
# --- Odometer distribution: 100-bin histogram with mean/median markers ---
odometer = dataset['odometer']
fig, ax = plt.subplots(figsize=(8,6))
ax.hist(odometer, bins=100, align='right')
ax.set_title('Mileage')
ax.set_xlabel('Mile')
ax.set_ylabel('Count')
# Reference lines: mean in black, median in red.
ax.axvline(x=odometer.mean(),color='black',label='mean')
ax.axvline(x=odometer.median(),color='r',label='median')
ax.legend()
fig.show()
# Vertical bar charts of listing counts: one figure per (column, title) pair,
# drawn in the order listed.
for column_name, chart_title in (
    ('transmission', 'Transmission Type'),
    ('drive', 'Drive Type'),
):
    category_counts = dataset[column_name].value_counts()
    fig, ax = plt.subplots(figsize=(8,6))
    ax.bar(category_counts.index, category_counts.values)
    ax.set_title(chart_title)
    ax.set_ylabel('Count')
    fig.show()

# Horizontal bar charts of listing counts, most frequent category on top.
for column_name, chart_title in (
    ('type', 'Vehicle Type'),
    ('paint_color', 'Paint Color'),
):
    category_counts = dataset[column_name].value_counts()
    fig, ax = plt.subplots(figsize=(8,6))
    ax.barh(category_counts.index, category_counts.values)
    ax.set_title(chart_title)
    ax.set_xlabel('Count')
    ax.invert_yaxis()
    fig.show()
# --- Listings per state (horizontal bars, taller figure to fit every state) ---
state_counts = dataset['state'].value_counts()
fig, ax = plt.subplots(figsize=(8,8))
ax.barh(state_counts.index, state_counts.values)
ax.set_title('State')
# In a horizontal bar chart the counts run along the x axis; the y axis holds
# the state labels. The label was previously attached to the y axis, unlike
# every other barh chart in this script.
ax.set_xlabel('Count')
ax.invert_yaxis()  # most frequent state at the top
fig.tight_layout()
fig.show()
# --- Pairwise scatter/histogram grid for the three columns used here ---
# corner=True draws only the lower triangle of the grid.
sns.pairplot(
    dataset[['price', 'odometer', 'year']],
    corner=True,
    plot_kws=dict(marker="+", linewidth=1),
)
plt.show()

# --- Hex-bin joint distributions of price against odometer and year ---
# The jointplot calls must be indented under their `with` statements so the
# temporary 'white' axes style applies to them. x, y and data are passed by
# keyword because seaborn >= 0.12 no longer accepts them positionally.
with sns.axes_style('white'):
    sns.jointplot(x='price', y='odometer', data=dataset, kind='hex')
with sns.axes_style('white'):
    sns.jointplot(x='price', y='year', data=dataset, kind='hex')
# Optional regression view, left disabled as in the original analysis:
# sns.regplot(x='odometer', y='price', data=dataset)
# --- Correlation heatmap including one-hot encoded categorical columns ---
# Append indicator (dummy) columns for each categorical feature, in order.
# For 'condition' and 'cylinders' the 'null' indicator column is dropped
# right after it is added (presumably 'null' marks unreported values --
# confirm against the CSV); the other features keep all of their indicators.
dd = dataset
for column_name, drop_null in (
    ('condition', True),
    ('cylinders', True),
    ('fuel', False),
    ('transmission', False),
    ('drive', False),
    ('type', False),
):
    dd = pd.concat([dd, pd.get_dummies(dataset[column_name])], axis=1)
    if drop_null:
        dd = dd.drop(columns=['null'])
# paint_color is intentionally not encoded (disabled in the original analysis):
#dd = pd.concat([dd,pd.get_dummies(dataset.paint_color)], axis=1)

# dd still contains the original string columns. pandas >= 2.0 raises when
# corr() meets non-numeric data, so restrict to numeric and boolean columns
# (get_dummies yields bool in pandas 2.x and uint8 in earlier versions).
corr = dd.select_dtypes(include=['number', 'bool']).corr()

fig, ax = plt.subplots(figsize=(11, 9))
# Mask the upper triangle (including the diagonal) so each pair is shown once.
upper_triangle = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=upper_triangle, center=0, ax=ax,
            square=True, linewidths=.1, cbar_kws={"shrink": .5})
fig.show()
# --- Annotated correlation heatmap of the dataset's numeric columns ---
# The dataset holds string columns as well; pandas >= 2.0 raises if corr() is
# called on them, so select the numeric/boolean columns explicitly. This
# matches what older pandas versions did implicitly.
corr = dataset.select_dtypes(include=['number', 'bool']).corr()
fig, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(corr, center=0, annot=True, ax=ax,
            square=True, linewidths=.1, cbar_kws={"shrink": .5})
fig.show()
# Notebook-style peek at the first rows (only displayed interactively).
dataset.head(3)

# --- Price by condition (boxen plot) ---
price_condition = dataset[['price','condition']]
sns.catplot(x='price', y='condition', data=price_condition, kind='boxen')
plt.show()

# --- Odometer by condition (violin plot) ---
odometer_condition = dataset[['odometer','condition']]
sns.catplot(x='odometer', y='condition', data=odometer_condition, kind='violin')
plt.show()

# --- Price by vehicle type (boxen plot) ---
price_type = dataset[['price','type']]
sns.catplot(x='price', y='type', data=price_type, kind='boxen')
plt.show()

# --- Price by fuel, one panel per drive type, coloured by transmission ---
price_drivetrain = dataset[['price','transmission','drive','fuel']]
sns.catplot(
    x='price',
    y='fuel',
    col='drive',
    hue='transmission',
    data=price_drivetrain,
    kind='boxen',
)
plt.show()

# --- Price by cylinder category (boxen plot) ---
price_cylinders = dataset[['price','cylinders']]
sns.catplot(x='price', y='cylinders', data=price_cylinders, kind='boxen')
plt.show()

# --- Price by paint colour (boxen plot) ---
# Palette colours are applied in the order the categories are plotted;
# NOTE(review): presumably ordered to match each paint_color category -- confirm.
paint_palette = sns.color_palette(['gray','white','lime','silver','blue','brown','purple','red','black','orange','green','yellow'])
price_paint = dataset[['price','paint_color']]
sns.catplot(x='price', y='paint_color', data=price_paint, kind='boxen', palette=paint_palette)
plt.show()